Go through all of the citations in the citation_data
directory, and extract from each one the list of authors. Save the authors into the citations.h5
database.
In [1]:
import pandas as pd
import json
import os
import sys
from unidecode import unidecode
from IPython.display import clear_output
In [2]:
# Build the citations.h5 database: for every JSON file in citation_data/,
# store that citation's author table under a key derived from its DOI.
# NOTE: mode='w' creates a new file on each run, so there is never anything
# "already in the database" to skip.
store = pd.HDFStore("citations.h5", mode='w')

# try/finally guarantees the HDF5 file is closed even if a citation fails
# part-way through the loop or the cell is interrupted
try:
    for filename in sorted(os.listdir("citation_data")):

        # read the citation data, skipping files that are malformed.
        # ValueError covers JSON syntax errors and undecodable bytes, without
        # also swallowing KeyboardInterrupt the way a bare `except:` would.
        with open(os.path.join("citation_data", filename), 'r') as fh:
            try:
                citation = json.load(fh)
            except ValueError:
                continue

        # skip citations that don't have author information
        try:
            authors = citation['bibliographic']['author']
        except (TypeError, KeyError):
            continue

        # get the doi and the author information (rows that are entirely
        # missing are dropped)
        doi = citation['bibliographic']['DOI']
        df = pd.DataFrame(authors).dropna(how='all', axis=0)
        df['doi'] = doi

        # construct the place we will store it in the database (we need a fancy
        # naming scheme because otherwise we have 10000 objects all under the
        # same group -- we should take advantage of HDF5's hierarchical nature).
        # Also we have to prefix the numbers with underscores otherwise the
        # names are not considered "natural" -- i.e. they are not valid python
        # identifiers.
        doi_parts = doi.split(".")
        prefix = "_".join(doi_parts[:-1])
        number = doi_parts[-1]
        # one nested group per character of the final DOI component, leaving
        # off its last two characters
        number_path = "/_".join(number[:-2])
        store_path = "/dois/_{}/_{}/_{}".format(prefix, number_path, number)

        # print progress
        clear_output()
        print("Saving '{}' to '{}'".format(filename, store_path))
        sys.stdout.flush()

        # save into the database
        store.put(store_path, df)
finally:
    # close the HDF5 database
    store.close()
In [3]:
# check the size of the database as written by the loop above
# (-l long listing, -h human-readable sizes)
!ls -lh citations.h5
In [4]:
# recompress the database so it's not quite so gigantic: ptrepack copies
# citations.h5 into a new file, citations-small.h5, using the blosc compressor
# at compression level 9 with automatically chosen chunk shapes
!ptrepack --chunkshape=auto --propindexes --complevel=9 --complib=blosc citations.h5 citations-small.h5
In [5]:
# rename the compressed database so it takes the place of the original
# citations.h5 (the uncompressed file is overwritten)
!mv citations-small.h5 citations.h5
In [6]:
# check the size of the database again, now that citations.h5 is the
# recompressed copy
!ls -lh citations.h5